Using the Tara Oceans metagenomic and metatranscriptomic data, Metagenome Assembled Genomes identified as Dictyochophyceae were isolated as a case study to address some key questions.
(1) Can we use amplicon data to benchmark what we see in the eukaryotic MAGs? (2) What is the distribution of the Dictyochophyceae MAGs? and how does the biogeography of the MAGs vary according to MAG content, coverage, or other parameters? (3) What are the ecological differences among the detected Dictyochophyceae MAGs?
Amplicon Sequence Variants (ASVs) ASV taxonomy table from Benjamin Callahan. (2017). ASV Tables inferred by DADA2 from the TARA Oceans v9 metabarcoding dataset [Data set]. Zenodo. http://doi.org/10.5281/zenodo.581694. ASVs originate from de Vargas, C., Audic, S., Henry, N., Decelle, J., Mahé, F., Logares, R., … Karsenti, E. (2015, May 22). First Tara Oceans V9 rDNA metabarcoding dataset. Zenodo. http://doi.org/10.5281/zenodo.15600, where the V9 hypervariable region was amplified and sequenced. Tara Ocean ASVs total to 107,868 total and cover 299 samples.
Dictyochophyceae MAGs and relative abundances Metagenomic and metatranscriptomic reads were mapped against the microbial eukaryotic MAGs determined from the Tara Ocean dataset. Mapped reads were used to estimate relative abundances, where the total number of reads was used to calculate RPKM. TPM was also calculated, which involved normalizing the RPKM to the sum of all RPKMs in a given sample.
** Input MAGs are from ** /vortexfs1/omics/alexander/halexander/2020-tara-mag-abund/MAG_rpkm.csv and from the 50% cutoff high quality euk MAGs. MAG relative abundance input data frame lists of the name of the MAG (n=3554 total line), then the run accession that it was found in, the relative abundance, and the taxonomic identity. The relative abundance adds up to 100 for each ERR. ERRs covered total to 298 samples.
Clarify links to data origin here.
# library(dada2)
# Import data from Callahan et al.
# tara <- readRDS("st.consensus.rds")
# Assign taxonomy
# tara_tax <- assignTaxonomy(tara, "/vortexfs1/omics/huber/shu/db/pr2-db/pr2_version_4.12.0_18S_dada2.fasta.gz", taxLevels = c("Kingdom","Supergroup","Division","Class","Order","Family","Genus","Species"), multithread = TRUE)
## taxonmoy with the PR2 (v4.12) database done separately
# Saved output as R data object "tara-taxassign-30-06-2020.RData"
#
# Import results from taxonomy assignment
# load("tara-taxassign-30-06-2020.RData", verbose = T)
# tara_df <- as.data.frame(t(tara)) # format the count table
# head(tara_tax_df[1:2,]) #taxonomy assignment# Compile & save text file with annoated data
# tara_df_annot <- tara_df %>%
# rownames_to_column(var = "seq") %>%
# pivot_longer(cols = starts_with("ERR")) %>%
# left_join((tara_tax_df %>% rownames_to_column(var = "seq"))) %>%
# # left_join(meta, by = c("name" = "INSDC.run.accession.number.s.")) %>%
# select(-seq) %>%
# data.frame
# head(tara_df_annot[1:2,])
# dim(tara_df_annot)Import metadata for ASVs
# meta <- read.csv("metabar.csv"); meta$X <- NULL
# metaASV_2 <- meta %>%
# select(Sample.material = Sample.label..TARA_station._environmental.feature_size.fraction.,
# Sample..Sequence.Identifier, run_accession = INSDC.run.accession.number.s.,
# Station = Station.identifier..TARA_station..,
# Latitude = Latitude..degrees.North., Longitude = Longitude..degrees.East.,
# Depth = Sampling.depth..m., Environmental.Feature,
# SizeFrac_low = Size.fraction.lower.threshold..micrometre.,
# SizeFrac_upper = Size.fraction.upper.threshold..micrometre.,
# MP.biom = Marine.pelagic.biomes..Longhurst.2007.,
# OS.region = Ocean.and.sea.regions..IHO.General.Sea.Areas.1953...MRGID.registered.at.www.marineregions.com.,
# BG.provine = Marine.pelagic.biomes...Longhurst.2007...MRGID.registered.at.www.marineregions.com.) %>%
# add_column(study_accession = "PRJEB6610") %>%
# data.frame
# colnames(metaASV_2) <- paste0("ASV_", colnames(metaASV_2))
# # head(metaASV_2[1:2,])
#
# metaASV_3 <- metaASV_2 %>%
# select(SizeFrac_low = ASV_SizeFrac_low,
# SizeFrac_upper = ASV_SizeFrac_upper,
# Lat = ASV_Latitude,
# Long = ASV_Longitude,
# everything()) %>%
# data.frameMAG-based refinement tools may not fully capture the taxonomic assignment for a given microbial eukaryotic species. Here, we compare the relative abundances of ASVs and MAGs, for every sample in which they both appear, in order to supplement the ecological and biological findings for a given MAG. This may serve to benchmark the biogeography for a given taxonomic lineage or additional taxonomic resolution (i.e., assignment to the species level).
## Loading objects:
## mag_relab
## asv_relab
# Future SH note - sort out ASV files and get the correct ones in order
# Use asv_relab from 'inputdfs'
# Modify sequence IDs
## Save for downstream analysis
asv_seqIDs <- asv_relab %>%
select(seq) %>%
distinct() %>%
rowid_to_column("seq_num") %>%
mutate(seq_id = paste("ASV-", seq_num, sep = ""))
head(asv_seqIDs)## seq_num
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## seq
## 1 GTCGCTACTACCGATTGAACGTTTTAGTGAGGTCCTCGGACTGTTTGGTAGTCGGATCACTCTGACTGCCTGGCGGGAAGACGACCAAACTGTAGCGTTTAGAGGAAGTAAAAGTCGTAACAAGGTTTCC
## 2 GTCGCTACTACCGATTGAACGTTTTAGTGAGGTCCTCGGACTGTTTGCCTGGCGGATTACTCTGCCTGGCTGGCGGGAAGACGACCAAACTGTAGCGTTTAGAGGAAGTAAAAGTCGTAACAAGGTTTCC
## 3 GTCGCTACTACCGATTGAACGTTTTAGTGAGGTCCTCGGACTGTGAGCCTGGCGGGTCATTCTGCCTGGTCTGCGGGAAGACGACCAAACTGTAGCGTTTAGAGGAAGTAAAAGTCGTAACAAGGTTTCC
## 4 GTCGCTCCTACCGATTGAATACGTTGGTGATTGAATTGGATAAAGAGATATCATCTTAAATGATAGCAAAGCGGTAAACATTTGTAAACTAGATTATTTAGAGGAAGGAGAAGTCGTAACAAGGTTTCC
## 5 GTCGCTACTACCGATTGAACGTTTTAGTGAGGTCCTCGGATTGTGATTCAGCTGGTTCACGCTGACTGTTTTGCGAGAAGACGACCAAACTGAAGCGTTTAGAGGAAGTAAAAGTCGTAACAAGGTTTCC
## 6 GTCGCTACTACCGATTGAACGTTTTAGTGAGGTATTTGGACTGGGCCTTGGGAGGATTCGTTCTCCCATGTTGCTCGGGAAGACTCCCAAACTTGAGCGTTTAGAGGAAGTAAAAGTCGTAACAAGGTTTCC
## seq_id
## 1 ASV-1
## 2 ASV-2
## 3 ASV-3
## 4 ASV-4
## 5 ASV-5
## 6 ASV-6
# Import MAG counts
mag_counts_updated <- read.csv("input-data/MAG_tpm.csv")
# Import MAG taxonomic assignments (Feb 2, 2021)
mag_taxonomy <- read.csv("input-data/2021-marzoanmmetsp-estimated-taxonomy-levels.csv")metaT_metadata <- read.delim("input-data/SampleList_2020_metaT.txt")
metaG_metadata <- read.delim("input-data/SampleList_2020_metaG.txt")
tmp_metadata <- read.delim("input-data/PRJEB4352_metaG_wenv.txt")# dic<-c("MS-all-SRF-0-8-5-00_bin-158","MS-all-SRF-0-8-5-00_bin-588","IO-all-SRF-0-8-5-00_bin-80","SAO-all-SRF-0-8-5-00_bin-590")
# tmp_subset <- filter(mag_taxonomy, mag %in% dic)In total, there are only 2 MAGs that were assigned to the class Dictyochophyceae.
## [1] - Cryptophyceae Prymnesiophyceae Dictyochophyceae
## [5] Chloropicophyceae Spirotrichea Mamiellophyceae Branchiopoda
## [9] Hexanauplia Palmophyllophyceae Pelagophyceae Bacillariophyta
## [13] Bolidophyceae Ascomycota Prasino-Clade-V Actinopteri
## [17] Anthozoa Gastropoda Pyramimonadales
## 19 Levels: - Actinopteri Anthozoa Ascomycota Bacillariophyta ... Spirotrichea
Import metadata and clean up
## Sub_region Depth_sizefrac ERR_count
## 1 SPO-all DCM-0.8-5.00 26
## 2 SPO-all DCM-180-2000.00 16
## 3 SPO-all SRF-0.8-5.00 43
## 4 SPO-all SRF-180-2000.00 33
## 5 SPO-all SRF-20-180.00 31
## 6 SPO-all SRF-5-20.00 23
## ERR_list
## 1 ERR1726527, ERR1726579, ERR1726584, ERR1726609, ERR1726631, ERR1726842, ERR1726858, ERR868358, ERR868364, ERR868381, ERR868415, ERR868419, ERR868422, ERR868424, ERR868425, ERR868432, ERR868438, ERR868440, ERR868446, ERR868453, ERR868457, ERR868458, ERR868465, ERR868505, ERR873961, ERR873966
## 2 ERR1700911, ERR1726550, ERR1726552, ERR1726566, ERR1726604, ERR1726623, ERR1726627, ERR1726698, ERR1726778, ERR1726806, ERR1726826, ERR1726880, ERR1726882, ERR1726900, ERR1726922, ERR1726962
## 3 ERR1726556, ERR1726564, ERR1726567, ERR1726569, ERR1726602, ERR1726628, ERR1726634, ERR1726643, ERR1726645, ERR1726656, ERR1726667, ERR1726714, ERR1726719, ERR1726725, ERR1726749, ERR1726762, ERR1726846, ERR1726851, ERR1726907, ERR1726913, ERR868352, ERR868357, ERR868363, ERR868382, ERR868404, ERR868405, ERR868413, ERR868416, ERR868436, ERR868442, ERR868444, ERR868447, ERR868462, ERR868466, ERR868469, ERR868475, ERR868476, ERR868489, ERR868493, ERR868499, ERR868501, ERR868502, ERR868513
## 4 ERR1726541, ERR1726543, ERR1726561, ERR1726586, ERR1726593, ERR1726600, ERR1726608, ERR1726620, ERR1726624, ERR1726657, ERR1726664, ERR1726685, ERR1726694, ERR1726705, ERR1726716, ERR1726739, ERR1726740, ERR1726741, ERR1726745, ERR1726763, ERR1726765, ERR1726770, ERR1726775, ERR1726790, ERR1726795, ERR1726876, ERR1726883, ERR1726891, ERR1726895, ERR1726909, ERR1726916, ERR1726931, ERR1726946
## 5 ERR1726524, ERR1726525, ERR1726528, ERR1726544, ERR1726558, ERR1726577, ERR1726588, ERR1726592, ERR1726598, ERR1726601, ERR1726605, ERR1726615, ERR1726622, ERR1726639, ERR1726666, ERR1726670, ERR1726695, ERR1726709, ERR1726721, ERR1726755, ERR1726785, ERR1726834, ERR1726850, ERR1726857, ERR1726892, ERR1726903, ERR1726927, ERR1726932, ERR1726952, ERR1726963, ERR1726970
## 6 ERR1726522, ERR1726533, ERR1726535, ERR1726571, ERR1726594, ERR1726632, ERR1726688, ERR1726691, ERR1726708, ERR1726713, ERR1726732, ERR1726735, ERR1726779, ERR1726822, ERR1726841, ERR1726865, ERR1726878, ERR1726890, ERR1726896, ERR1726912, ERR1726938, ERR1726943, ERR1726961
## Assembly_group
## 1 SPO-all-DCM-0.8-5.00
## 2 SPO-all-DCM-180-2000.00
## 3 SPO-all-SRF-0.8-5.00
## 4 SPO-all-SRF-180-2000.00
## 5 SPO-all-SRF-20-180.00
## 6 SPO-all-SRF-5-20.00
metaG_reformat <- metaG_metadata %>%
separate_rows(ERR_list, sep = ", ") %>%
separate(Sub_region, c("REGION", "subregion"), sep = "-", remove = FALSE) %>%
separate(Depth_sizefrac, c("DEPTH", "min", "max"), sep = "-", remove = FALSE) %>%
unite(SIZEFRAC, min, max, sep = "-") %>%
select(-Assembly_group, -ERR_count, run_accession = "ERR_list", REGION, DEPTH, SIZEFRAC) %>%
left_join(tmp_metadata %>% select(run_accession, sampleid = Sample.material)) %>%
distinct()
# unique(metaG_metadata$Sub_region)
# head(metaG_reformat)# tmp_metadata <- read.delim("input-data/PRJEB4352_metaG_wenv.txt")
#
# metaG_metadata <- tmp_metadata %>%
# select(study_accession, sample_accession, run_accession, experiment_alias,
# sample_alias, sample_title, Campaign, Station, Device, Event,
# Latitude, Longitude, Env.feature, Depth = Depth..nominal,
# SizeFrac_low = Fraction.lower..µm., SizeFrac_upper = Fraction.upper..µm.,
# Sample.material, Sample.label,
# MP.biome, OS.region, BG.province) %>%
# mutate(SizeFrac_upper = str_replace(SizeFrac_upper, ">3.00", ">")) %>%
# mutate(SizeFrac_upper = str_replace(SizeFrac_upper, ">5.00", ">")) %>%
# mutate(SizeFrac_upper = str_replace(SizeFrac_upper, ">0.80", ">")) %>%
# mutate(SizeFrac_upper = str_replace(SizeFrac_upper, "2000.00", "2000")) %>%
# mutate(SizeFrac_upper = str_replace(SizeFrac_upper, "180.00", "180")) %>%
# mutate(SizeFrac_upper = str_replace(SizeFrac_upper, "20.00", "20")) %>%
# mutate(SizeFrac_upper = str_replace(SizeFrac_upper, "5.00", "5")) %>%
# mutate(SizeFrac_upper = str_replace(SizeFrac_upper, "3.00", "3")) %>%
# data.frame
# Reformat column names and class
# colnames(metaG_metadata) <- paste0("MAG_", colnames(metaG_metadata))
# metaG_metadata$MAG_SizeFrac_upper <- as.factor(metaG_metadata$MAG_SizeFrac_upper)
# metaG_metadata$MAG_Depth <- as.factor(metaG_metadata$MAG_Depth)
#
# # Get sample list information along with ERRs
# mag_sampleid <- metaG_metadata %>%
# select(sampleid = MAG_Sample.material, run_accession = MAG_run_accession) %>%
# distinct()
# head(metaG_reformat)Description of data frame structure.
Compile MAGs of interest and subset to include distribution and taxonomy information. Calculate relative abundance of MAGs for each Tara ocean sample.
# head(mag_counts_updated[1:2,])
mags_relab <- mag_taxonomy %>%
# Join with count data, only keep those with taxonomy names
inner_join(mag_counts_updated, by = c("mag" = "Genome")) %>%
pivot_longer(cols = starts_with("ERR"), names_to = "run_accession", values_to = "tpm") %>%
group_by(run_accession) %>%
# Calculate relative abundance
mutate(relabun = ((tpm / sum(tpm))*100)) %>%
# Join with sampleID information (used to match with ASVs)
left_join(metaG_reformat) %>%
unite(fullname, supergroup, division, class, order, family, genus, species, sep = ";", remove = FALSE) %>%
mutate(fullname = gsub(";-", "", fullname)) %>%
data.frame## Joining, by = "run_accession"
## [1] 994
## mag domain fullname supergroup division class
## 1 SAO-all-DCM-0-8-5-00_bin-388 Eukaryota - - - -
## 2 SAO-all-DCM-0-8-5-00_bin-388 Eukaryota - - - -
## 3 SAO-all-DCM-0-8-5-00_bin-388 Eukaryota - - - -
## 4 SAO-all-DCM-0-8-5-00_bin-388 Eukaryota - - - -
## 5 SAO-all-DCM-0-8-5-00_bin-388 Eukaryota - - - -
## 6 SAO-all-DCM-0-8-5-00_bin-388 Eukaryota - - - -
## order family genus species run_accession tpm relabun Sub_region
## 1 - - - - ERR1700889 0.2934997 5.545726e-05 IO-all
## 2 - - - - ERR1700890 0.6981203 2.295297e-04 IO-all
## 3 - - - - ERR1700891 19.7895972 2.730741e-03 NAO-all
## 4 - - - - ERR1700892 10.1474404 5.431486e-03 IO-all
## 5 - - - - ERR1700893 100.6220940 3.280503e-02 NAO-all
## 6 - - - - ERR1700894 52.6206170 6.147862e-02 MS-all
## REGION subregion Depth_sizefrac DEPTH SIZEFRAC sampleid
## 1 IO all SRF-180-2000.00 SRF 180-2000.00 TARA_039_SRF_180-2000
## 2 IO all SRF-5-20.00 SRF 5-20.00 TARA_041_SRF_5-20
## 3 NAO all DCM-5-20.00 DCM 5-20.00 TARA_142_DCM_5-20
## 4 IO all ZZZ-0.8-5.00 ZZZ 0.8-5.00 TARA_040_ZZZ_3->
## 5 NAO all SRF-5-20.00 SRF 5-20.00 TARA_143_SRF_5-20
## 6 MS all DCM-5-20.00 DCM 5-20.00 TARA_009_DCM_5-20
# Check relative abundance
# tmp <- filter(mags_relab, run_accession == "ERR1700891")
# sum(tmp$relab) # Sum to 100For each sample (or ERRxxx), the relative abundances of ASVs or MAGs totals to 100%.
Report summary of the total MAGs and ASVs to compare.
# Total MAGs and ASVs
writeLines(paste("there are", length(unique(mags_relab$mag)), "total MAGs, which cover", length(unique(mags_relab$sampleid)), "samples."))## there are 994 total MAGs, which cover 633 samples.
writeLines(paste("there are", length(unique(asv_relab$seq)), "total ASVs, which cover",length(unique(asv_relab$sampleid)), "samples."))## there are 107868 total ASVs, which cover 299 samples.
# Overlap
writeLines(paste(dim(mags_relab %>%
select(sampleid) %>%
distinct() %>%
inner_join(select(asv_relab, sampleid)) %>%
filter(!is.na(sampleid)) %>%
distinct())[1], "samples overlap betwen MAG- and ASV-derived data"))## Joining, by = "sampleid"
## 298 samples overlap betwen MAG- and ASV-derived data
Remove NAs ahead of time.
# Summary of upper (Taxonomic) level IDs
summary_mag <- mags_relab %>% select(supergroup, division, class, mag) %>%
group_by(class) %>% mutate(totalMAGs = n(), totaluniqMAGs = n_distinct(mag)) %>%
select(-mag) %>% distinct() %>% arrange(supergroup, division)
# summary_mag
# unique(summary_mag$supergroup)
# colnames(summary_mag)
ggplot(summary_mag %>% filter(!(class == "-")), aes(x = division, y = totalMAGs, fill = class)) +
geom_bar(stat = "identity", color = "black") +
facet_grid(supergroup ~ ., space = "free", scale = "free") +
theme_linedraw() +
coord_flip() +
scale_y_continuous(expand = c(0,0)) +
theme(axis.text.x = element_text(angle = 0),
strip.background = element_blank(),
legend.position = "bottom") +
labs(x = "", y = "Total MAGs", title = "MAGs by class")# head(asv_relab)
summary_asv <- asv_relab %>% select(supergroup, division, class, seq) %>%
group_by(class) %>% mutate(totalASVs = n(), totaluniqASVs = n_distinct(seq)) %>%
select(-seq) %>% distinct() %>% arrange(supergroup, division)
# head(summary_asv)
ggplot(summary_asv %>% filter(!is.na(division)), aes(x = supergroup, y = totalASVs, fill = division)) +
geom_bar(stat = "identity", color = "black") +
facet_grid(supergroup ~ ., space = "free", scale = "free") +
theme_linedraw() +
coord_flip() +
scale_y_continuous(expand = c(0,0)) +
theme(axis.text.x = element_text(angle = 0),
strip.background = element_blank(),
legend.position = "bottom") +
labs(x = "", y = "Total ASVs", title = "ASVs by class")Function to match ASV results to MAG results
Below function will map all ASVs to MAGs based on taxonomic name. Function will retain only samples that appear in both ASV and MAG data. Mapping relies on matching the sample ID (Tara station #, size fraction, depth) and taxonomic grouping. Then will subset so that the number of ASV-MAG occurrences is > freq. Then, function performs linear regression, keeps positive slopes, and subsets the highest r.squared value for a MAG-ASV match. This is all written to output filename.
Plan to look into the Dictyochophyceae (class level Silicoflagellates within the Ochrophyta).
check_mag() function serves to compare how many taxonomic levels between the ASV and MAG results match over a number of samples. User defined the taxonomic “level” (e.g., class or order) and output reports the number of MAGs and ASVs that match.
# Check MAG data for taxa of interest
check_mag <- function(mag_df, asv_df, level){
level <- enquo(level)
mag_out <- mag_df %>%
unite(TAXA, supergroup:!!level, remove = TRUE, sep =";")
asv_out <- asv_df %>%
unite(TAXA, supergroup:!!level, remove = TRUE, sep =";")
writeLines(paste(length(unique(mag_out$TAXA)), "unique in MAG data"))
writeLines(paste(length(unique(asv_out$TAXA)), "unique in ASV data"))
unique(mag_out$TAXA)
}
# check_mag(mag_relab, asv_relab, class)Check how many matches there are between MAG and ASV data at the class level
# Output lists matches to the CLASS level.
## There are 16 unique tax IDs to the class level from the MAGs and 191 unique class level designations in the ASV data
check_mag(mags_relab, asv_relab, class)## 30 unique in MAG data
## 191 unique in ASV data
## [1] "-;-;-"
## [2] "Hacrobia;Cryptophyta;Cryptophyceae"
## [3] "Stramenopiles;Ochrophyta;-"
## [4] "Hacrobia;Haptophyta;Prymnesiophyceae"
## [5] "Stramenopiles;Ochrophyta;Dictyochophyceae"
## [6] "Metazoa;Arthropoda;-"
## [7] "Metazoa;-;-"
## [8] "Proteobacteria;-;-"
## [9] "Archaeplastida;Chlorophyta;Chloropicophyceae"
## [10] "Alveolata;Ciliophora;Spirotrichea"
## [11] "Archaeplastida;Chlorophyta;Mamiellophyceae"
## [12] "Archaeplastida;Chlorophyta;-"
## [13] "Metazoa;Arthropoda;Branchiopoda"
## [14] "Metazoa;Arthropoda;Hexanauplia"
## [15] "Archaeplastida;Chlorophyta;Palmophyllophyceae"
## [16] "Stramenopiles;Ochrophyta;Pelagophyceae"
## [17] "Alveolata;Ciliophora;-"
## [18] "Stramenopiles;Ochrophyta;Bacillariophyta"
## [19] "Stramenopiles;Ochrophyta;Bolidophyceae"
## [20] "Opisthokonta;Fungi;Ascomycota"
## [21] "Archaeplastida;Chlorophyta;Prasino-Clade-V"
## [22] "Metazoa;Chordata;Actinopteri"
## [23] "Stramenopiles;-;-"
## [24] "Alveolata;Dinoflagellata;-"
## [25] "Hacrobia;Haptophyta;-"
## [26] "Metazoa;Cnidaria;Anthozoa"
## [27] "Metazoa;Mollusca;Gastropoda"
## [28] "Metazoa;Cnidaria;-"
## [29] "Archaeplastida;Chlorophyta;Pyramimonadales"
## [30] "Amoebozoa;-;-"
# head(asv_relab)
# At the order level, there are 18 unique tax desigations to the order level in the MAG data and over 380 in the ASV data
# check_mag(mag_relab, asv_relab, order)Note the group: “Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X”, that ends with *_X*. This indicates that there is not any more meaningful information beyond Dictyochophyceae. Plan to explore this further.
We’ve now compared the MAG and ASV taxonomic assignments to the class level to see how many overlap. Here, we will pull out those ASVs that are identified as Dictyochophytes and have similar relative abundance patterns to the MAGs across Tara Ocean samples.
## [1] "mag" "domain" "fullname" "supergroup"
## [5] "division" "class" "order" "family"
## [9] "genus" "species" "run_accession" "tpm"
## [13] "relabun" "Sub_region" "REGION" "subregion"
## [17] "Depth_sizefrac" "DEPTH" "SIZEFRAC" "sampleid"
## [1] "seq" "run_accession" "fullname" "supergroup"
## [5] "division" "class" "order" "family"
## [9] "genus" "species" "relabun" "sampleid"
Subsetting at the class level for dictyochophytes and matching MAGs and ASVs seems most appropriate.
map_ASVtoMAG_bylevel <- function(mag_df, asv_df, subset_level, subset_taxa, map_level, freq){
subset_level <- enquo(subset_level)
map_level <- enquo(map_level)
mag_df %>%
filter(!!subset_level == subset_taxa) %>%
# Classify to genus level
unite(TAXA, supergroup:!!map_level, remove = FALSE, sep =";") %>%
# Classify ASVs to genus level and merge by genus and sample ID
left_join(unite(asv_df, TAXA, supergroup:!!map_level, remove = FALSE, sep =";"),
by = c("TAXA"="TAXA", "sampleid" = "sampleid"), suffix = c(".MAG", ".ASV")) %>%
filter(!is.na(run_accession.ASV) & !is.na(sampleid)) %>% # Remove samples without an ASV
group_by(mag, seq) %>% # Selects correct grouping
mutate(OCCUR = n_distinct(sampleid)) %>% # Count occurences and ratio of relative abundances
filter(OCCUR >= freq) %>% #ASV-MAG incidence must occur more than predetermined # of times
data.frame
}Below function subsets the ASV and MAG dataframe to select only entries that match to the class level and include “Dictyochophyceae”. After matching ASVs and MAGs, only keep those MAG-ASV matches where there are at least 5 samples that include both ASV and MAGs. Increasing this last number may improve statistical power of the MAG and ASV match.
Matching the MAGs to the ASVs that all fell within the Ochrophyta category (division level). For a MAG and ASV, if both division level assignments were ochrophyta and the ASV and MAG appeared in the same 20 samples, they matched.
Since few of the dictyochophyte
# dictyo_class <- map_ASVtoMAG_bylevel(mag_relab, asv_relab, class, "Dictyochophyceae", class, 0)
ochro_div <- map_ASVtoMAG_bylevel(mags_relab, asv_relab, division, "Ochrophyta", division, 20)Over 30,000 pairs of MAG-ASV matched. Meaning, there were over 30,000 instances of a MAG and ASV that had at least the Ochrophyta division level assignment and appeared in the same sample.
## [1] "mag" "domain" "fullname.MAG"
## [4] "TAXA" "supergroup.MAG" "division.MAG"
## [7] "class.MAG" "order.MAG" "family.MAG"
## [10] "genus.MAG" "species.MAG" "run_accession.MAG"
## [13] "tpm" "relabun.MAG" "Sub_region"
## [16] "REGION" "subregion" "Depth_sizefrac"
## [19] "DEPTH" "SIZEFRAC" "sampleid"
## [22] "seq" "run_accession.ASV" "fullname.ASV"
## [25] "supergroup.ASV" "division.ASV" "class.ASV"
## [28] "order.ASV" "family.ASV" "genus.ASV"
## [31] "species.ASV" "relabun.ASV" "OCCUR"
## [1] 30883 2
Ecological inquiry into how we can augment taxonomy assignment for MAGs.
Can we expand the number of MAGs that represent Dictyochophytes?
First, find those dictyochophyte ASV-MAG pairs Subset by Dictyochophyte assignment - where either the ASV or MAG had a dictyochophyte assignment.
ochro_sub_dictyo <- ochro_div %>%
mutate(dictyo_mag = ifelse(class.MAG == "Dictyochophyceae", "true", "false"),
dictyo_asv = ifelse(class.ASV == "Dictyochophyceae", "true", "false")) %>%
unite(category, dictyo_mag, dictyo_asv, sep = "-") %>%
mutate(dictyo = case_when(
category == "true-true" ~ "both",
category == "true-false" ~ "MAG",
category == "false-true" ~ "ASV",
TRUE ~ "none"
)) %>%
filter(!(dictyo == "none"))
head(ochro_sub_dictyo)## mag domain fullname.MAG
## 1 MS-all-DCM-0-8-5-00_bin-318 Eukaryota Stramenopiles;Ochrophyta
## 2 MS-all-DCM-0-8-5-00_bin-318 Eukaryota Stramenopiles;Ochrophyta
## 3 MS-all-DCM-0-8-5-00_bin-318 Eukaryota Stramenopiles;Ochrophyta
## 4 MS-all-DCM-0-8-5-00_bin-318 Eukaryota Stramenopiles;Ochrophyta
## 5 MS-all-DCM-0-8-5-00_bin-318 Eukaryota Stramenopiles;Ochrophyta
## 6 MS-all-DCM-0-8-5-00_bin-318 Eukaryota Stramenopiles;Ochrophyta
## TAXA supergroup.MAG division.MAG class.MAG order.MAG
## 1 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## 2 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## 3 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## 4 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## 5 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## 6 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## family.MAG genus.MAG species.MAG run_accession.MAG tpm relabun.MAG
## 1 - - - ERR1700889 0.17678640 3.340409e-05
## 2 - - - ERR1700894 12.40358542 1.449157e-02
## 3 - - - ERR1700894 12.40358542 1.449157e-02
## 4 - - - ERR1700894 12.40358542 1.449157e-02
## 5 - - - ERR1700894 12.40358542 1.449157e-02
## 6 - - - ERR1700897 0.03206138 9.206607e-05
## Sub_region REGION subregion Depth_sizefrac DEPTH SIZEFRAC
## 1 IO-all IO all SRF-180-2000.00 SRF 180-2000.00
## 2 MS-all MS all DCM-5-20.00 DCM 5-20.00
## 3 MS-all MS all DCM-5-20.00 DCM 5-20.00
## 4 MS-all MS all DCM-5-20.00 DCM 5-20.00
## 5 MS-all MS all DCM-5-20.00 DCM 5-20.00
## 6 SPO-SPSG SPO SPSG DCM-5-20.00 DCM 5-20.00
## sampleid
## 1 TARA_039_SRF_180-2000
## 2 TARA_009_DCM_5-20
## 3 TARA_009_DCM_5-20
## 4 TARA_009_DCM_5-20
## 5 TARA_009_DCM_5-20
## 6 TARA_098_DCM_5-20
## seq
## 1 GTCGCACCTACCGATTGAATGGTTCGGTGAGGTCTCAGGATCGTGGCTTAGCACCTTCATTGGAGCTCTGCCGTGAGAATTTGCCCAAACCTCATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## 2 GTCGCACCTACCGATTGAATGGTTCGGTGAGGTCTCAGGATCGTGGCTTAGCACCTTCATTGGAGCTCTGCCGTGAGAATTTGCCCAAACCTCATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## 3 GTCGCACCTACCGATCGAAGGTAATGATGAGCCCTCAGGACAGTTTTGTGTTTCCTTCACGGGTTACCAAGACTGAAATTTGTGCAAATCCTTGCCTTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## 4 GTCGCACCTACCGATGAACAGATCGATGAGGCATGAGGAGTACCGCTTGAACCGGCAACGGATCTTGTGGCGCGAATTTTTCCAAATCTCTTTGTTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## 5 GTCGCACCTACCGATCGAAGGTAATGATGAGCCCTCAGGACAGTCTTCTGTTCGCTTCACGGCTTACTGAGATCTGAAATTTGTGCAAATCCTTGCCTTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## 6 GTCGCACCTACCGATTGAATGGTTCGGTGAGGTCTCAGGATCGTGGCTTAGCACCTTCATTGGAGCTCTGCCGTGAGAATTTGCCCAAACCTCATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## run_accession.ASV
## 1 ERR562617
## 2 ERR562464
## 3 ERR562464
## 4 ERR562464
## 5 ERR562464
## 6 ERR562433
## fullname.ASV
## 1 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Florenciellales;Pseudochattonella;Pseudochattonella verruculosa
## 2 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Florenciellales;Pseudochattonella;Pseudochattonella verruculosa
## 3 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Pedinellales;Pedinella;Pedinella sp.
## 4 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Dictyochales;Dictyochales_X;Dictyochales_X sp.
## 5 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Pedinellales;Pedinella;Pedinella sp.
## 6 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Florenciellales;Pseudochattonella;Pseudochattonella verruculosa
## supergroup.ASV division.ASV class.ASV order.ASV
## 1 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## 2 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## 3 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## 4 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## 5 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## 6 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## family.ASV genus.ASV species.ASV relabun.ASV
## 1 Florenciellales Pseudochattonella Pseudochattonella verruculosa 0.0004224400
## 2 Florenciellales Pseudochattonella Pseudochattonella verruculosa 0.0052543637
## 3 Pedinellales Pedinella Pedinella sp. 0.0099832911
## 4 Dictyochales Dictyochales_X Dictyochales_X sp. 0.0026271819
## 5 Pedinellales Pedinella Pedinella sp. 0.0026271819
## 6 Florenciellales Pseudochattonella Pseudochattonella verruculosa 0.0002441347
## OCCUR category dictyo
## 1 204 false-true ASV
## 2 204 false-true ASV
## 3 163 false-true ASV
## 4 83 false-true ASV
## 5 85 false-true ASV
## 6 204 false-true ASV
## [1] 1403047 35
##
## ASV both MAG
## 1212345 27870 162832
What is the result of this matching? * Total number of MAGs and ASVs included in matches Over 3600 unique MAG-ASV pairs
Perform linear regression to see if ASV relative abundance supports MAG relative abundance. `est_lm() function performs linear regression for each MAG-ASV pair. User defines the number of samples that the ASV-MAG occurrences much appear and the r squared threshold.
# Function to estimate slope and r squared of ASV and MAG pairs
## Options to subset by frequency again and r-squared
est_lm <- function(df, freq, r){
df_line <- df %>%
nest(-mag, -seq) %>%
mutate(lm_fit = map(data, ~ lm(relabun.MAG ~ relabun.ASV, data = .)),
tidied = map(lm_fit, tidy)) %>%
unnest(tidied) %>%
select(mag, seq, term, estimate) %>%
pivot_wider(names_from = term, values_from = estimate) %>%
data.frame
# Modify column names
colnames(df_line) <- c("mag", "seq", "Intercept", "Slope")
# Re-do lm() to select rsquared value
df_lm <- df %>%
nest(-mag, -seq) %>%
mutate(lm_fit = map(data, ~ lm(relabun.MAG ~ relabun.ASV, data = .)),
glanced = map(lm_fit, glance)) %>%
unnest(glanced) %>%
select(mag, seq, r.squared) %>%
# Join with other lm information
right_join(df_line, by = c("mag"="mag", "seq"="seq")) %>%
# Join with all other metadata for mag-seq pair
right_join(df, by = c("mag"="mag", "seq"="seq")) %>%
# Filter by positive slope, occurence, and rsquared value
filter(Slope > 0) %>%
filter(OCCUR > freq) %>%
filter(r.squared > r) %>%
data.frame
tmp_stats <- df_lm %>%
select(mag, seq, OCCUR, r.squared) %>%
distinct()
# Report:
writeLines(paste(range(tmp_stats$r.squared)[1], " min of r^2 values"))
writeLines(paste(range(tmp_stats$r.squared)[2], " max of r^2 values"))
writeLines(paste(length(unique(tmp_stats$mag))," total MAGs"))
writeLines(paste(length(unique(tmp_stats$seq))," total ASVs"))
writeLines(paste(dim(tmp_stats)[1]," total comparisons that will be plot"))
return(df_lm)
}Below, using the subset of putative dictyochophyte MAGs (derived from potential ASV co-occurrence), run linear regression to find best fit. Subset results so that r squared value is greater than 0.6.
## Warning: All elements of `...` must be named.
## Did you want `data = c(domain, fullname.MAG, TAXA, supergroup.MAG, division.MAG, class.MAG,
## order.MAG, family.MAG, genus.MAG, species.MAG, run_accession.MAG,
## tpm, relabun.MAG, Sub_region, REGION, subregion, Depth_sizefrac,
## DEPTH, SIZEFRAC, sampleid, run_accession.ASV, fullname.ASV,
## supergroup.ASV, division.ASV, class.ASV, order.ASV, family.ASV,
## genus.ASV, species.ASV, relabun.ASV, OCCUR, category, dictyo)`?
## Warning: All elements of `...` must be named.
## Did you want `data = c(domain, fullname.MAG, TAXA, supergroup.MAG, division.MAG, class.MAG,
## order.MAG, family.MAG, genus.MAG, species.MAG, run_accession.MAG,
## tpm, relabun.MAG, Sub_region, REGION, subregion, Depth_sizefrac,
## DEPTH, SIZEFRAC, sampleid, run_accession.ASV, fullname.ASV,
## supergroup.ASV, division.ASV, class.ASV, order.ASV, family.ASV,
## genus.ASV, species.ASV, relabun.ASV, OCCUR, category, dictyo)`?
## 0.600256106583639 min of r^2 values
## 0.960203968221247 max of r^2 values
## 42 total MAGs
## 15 total ASVs
## 74 total comparisons that will be plot
Output from linear regression function reports the min and max of r-squared values, the total MAGs and ASVs used for comparison and the total number of pairwise comparisons that need to be represented.
Reformat data frame to make plots.
# Explore and plot the comparison between MAG and ASVs for dictyochophytes
# From 249 comparisons to be plot, subsample to more promising comparisons
## And reformat
# head(asv_seqIDs)
dictyo_lm_toplot <- dictyo_lm %>%
# Select minimum R squared value
filter(r.squared >= 0.6) %>%
# import ASV sequence IDs
left_join(asv_seqIDs) %>%
# Extract highest resolution of taxonomic name for MAGs and ASVs
mutate(MAG_tax = str_extract(fullname.MAG, '\\b[^;]+$'),
ASV_tax = str_extract(fullname.ASV, '\\b[^;]+$'),
MAG_ID = paste("MAG", str_extract(mag, '\\b[^-]+$'), MAG_tax, sep = "-"),
ASV_ID = paste(seq_id, ASV_tax, sep = "-")) %>%
data.frame## Joining, by = "seq"
## mag
## 1 MS-all-DCM-0-8-5-00_bin-318
## 2 MS-all-DCM-0-8-5-00_bin-318
## 3 MS-all-DCM-0-8-5-00_bin-318
## 4 MS-all-DCM-0-8-5-00_bin-318
## 5 MS-all-DCM-0-8-5-00_bin-318
## 6 MS-all-DCM-0-8-5-00_bin-318
## seq
## 1 GTCGCACCTACCGATCGAAGGTAATGATGAGCCCTCAGGATTGGAACTTGGATCGGGTTTCCCGAACCTTGTGCCGAGAATCTGTGCAAATCCCTACCTTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## 2 GTCGCACCTACCGATCGAAGGTAATGATGAGCCCTCAGGATTGGAACTTGGATCGGGTTTCCCGAACCTTGTGCCGAGAATCTGTGCAAATCCCTACCTTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## 3 GTCGCACCTACCGATCGAAGGTAATGATGAGCCCTCAGGATTGGAACTTGGATCGGGTTTCCCGAACCTTGTGCCGAGAATCTGTGCAAATCCCTACCTTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## 4 GTCGCACCTACCGATCGAAGGTAATGATGAGCCCTCAGGATTGGAACTTGGATCGGGTTTCCCGAACCTTGTGCCGAGAATCTGTGCAAATCCCTACCTTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## 5 GTCGCACCTACCGATCGAAGGTAATGATGAGCCCTCAGGATTGGAACTTGGATCGGGTTTCCCGAACCTTGTGCCGAGAATCTGTGCAAATCCCTACCTTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## 6 GTCGCACCTACCGATCGAAGGTAATGATGAGCCCTCAGGATTGGAACTTGGATCGGGTTTCCCGAACCTTGTGCCGAGAATCTGTGCAAATCCCTACCTTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## r.squared Intercept Slope domain fullname.MAG
## 1 0.6834365 0.007994976 11.61199 Eukaryota Stramenopiles;Ochrophyta
## 2 0.6834365 0.007994976 11.61199 Eukaryota Stramenopiles;Ochrophyta
## 3 0.6834365 0.007994976 11.61199 Eukaryota Stramenopiles;Ochrophyta
## 4 0.6834365 0.007994976 11.61199 Eukaryota Stramenopiles;Ochrophyta
## 5 0.6834365 0.007994976 11.61199 Eukaryota Stramenopiles;Ochrophyta
## 6 0.6834365 0.007994976 11.61199 Eukaryota Stramenopiles;Ochrophyta
## TAXA supergroup.MAG division.MAG class.MAG order.MAG
## 1 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## 2 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## 3 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## 4 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## 5 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## 6 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## family.MAG genus.MAG species.MAG run_accession.MAG tpm relabun.MAG
## 1 - - - ERR1700903 0.000000 0.000000000
## 2 - - - ERR1700912 2.364306 0.001766583
## 3 - - - ERR1726556 12.718982 0.005777532
## 4 - - - ERR1726556 12.718982 0.005777532
## 5 - - - ERR1726556 12.718982 0.005777532
## 6 - - - ERR1726556 12.718982 0.005777532
## Sub_region REGION subregion Depth_sizefrac DEPTH SIZEFRAC sampleid
## 1 SPO-SPSG SPO SPSG DCM-5-20.00 DCM 5-20.00 TARA_100_DCM_5-20
## 2 IO-all IO all DCM-5-20.00 DCM 5-20.00 TARA_041_DCM_5-20
## 3 SPO-all SPO all SRF-0.8-5.00 SRF 0.8-5.00 TARA_122_SRF_0.8->
## 4 SPO-all SPO all SRF-0.8-5.00 SRF 0.8-5.00 TARA_122_SRF_0.8->
## 5 SPO-SPSG SPO SPSG SRF-0.8-5.00 SRF 0.8-5.00 TARA_122_SRF_0.8->
## 6 SPO-SPSG SPO SPSG SRF-0.8-5.00 SRF 0.8-5.00 TARA_122_SRF_0.8->
## run_accession.ASV
## 1 ERR562723
## 2 ERR562623
## 3 ERR562568
## 4 ERR562568
## 5 ERR562568
## 6 ERR562568
## fullname.ASV
## 1 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Rhizochromulinales;Rhizochromulina;Rhizochromulina marina
## 2 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Rhizochromulinales;Rhizochromulina;Rhizochromulina marina
## 3 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Rhizochromulinales;Rhizochromulina;Rhizochromulina marina
## 4 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Rhizochromulinales;Rhizochromulina;Rhizochromulina marina
## 5 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Rhizochromulinales;Rhizochromulina;Rhizochromulina marina
## 6 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Rhizochromulinales;Rhizochromulina;Rhizochromulina marina
## supergroup.ASV division.ASV class.ASV order.ASV
## 1 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## 2 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## 3 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## 4 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## 5 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## 6 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## family.ASV genus.ASV species.ASV relabun.ASV OCCUR
## 1 Rhizochromulinales Rhizochromulina Rhizochromulina marina 0.0009968400 33
## 2 Rhizochromulinales Rhizochromulina Rhizochromulina marina 0.0004877335 33
## 3 Rhizochromulinales Rhizochromulina Rhizochromulina marina 0.0009265393 33
## 4 Rhizochromulinales Rhizochromulina Rhizochromulina marina 0.0009265393 33
## 5 Rhizochromulinales Rhizochromulina Rhizochromulina marina 0.0009265393 33
## 6 Rhizochromulinales Rhizochromulina Rhizochromulina marina 0.0009265393 33
## category dictyo seq_num seq_id MAG_tax ASV_tax
## 1 false-true ASV 5815 ASV-5815 Ochrophyta Rhizochromulina marina
## 2 false-true ASV 5815 ASV-5815 Ochrophyta Rhizochromulina marina
## 3 false-true ASV 5815 ASV-5815 Ochrophyta Rhizochromulina marina
## 4 false-true ASV 5815 ASV-5815 Ochrophyta Rhizochromulina marina
## 5 false-true ASV 5815 ASV-5815 Ochrophyta Rhizochromulina marina
## 6 false-true ASV 5815 ASV-5815 Ochrophyta Rhizochromulina marina
## MAG_ID ASV_ID
## 1 MAG-318-Ochrophyta ASV-5815-Rhizochromulina marina
## 2 MAG-318-Ochrophyta ASV-5815-Rhizochromulina marina
## 3 MAG-318-Ochrophyta ASV-5815-Rhizochromulina marina
## 4 MAG-318-Ochrophyta ASV-5815-Rhizochromulina marina
## 5 MAG-318-Ochrophyta ASV-5815-Rhizochromulina marina
## 6 MAG-318-Ochrophyta ASV-5815-Rhizochromulina marina
## [1] 42
Supplementary figure that shows all ASV-MAG pairs.
# svg("figs/Supplementary-dictyochophyte-allpairs.svg", h = 11, w = 15)
dictyo_lm_toplot %>%
ggplot(aes(x = relabun.ASV, y = relabun.MAG, fill = ASV_ID)) +
geom_abline(intercept = 0, slope = 1, alpha = .5) + # Line of perfect fit
geom_abline(aes(group = ASV_ID, slope = Slope,
intercept = Intercept, color = ASV_ID)) +
geom_dl(aes(label = round(r.squared, 2), color = ASV_ID),
method = list("smart.grid", cex = 0.8)) +
geom_point(aes(fill = ASV_ID), color = "black", size = 2, shape = 21) +
facet_wrap(MAG_ID ~.,scales = "free") +
theme_linedraw() +
theme(axis.text = element_text(color = "black", face = "bold"),
strip.background = element_blank(),
strip.text = element_text(size = 8, color = "black", hjust = 1, vjust = -1),
legend.position = "right",
legend.title = element_blank(),
plot.margin = unit(c(1,3,1,1), "lines")) +
labs(x = "MAG relative abundance", y = "ASV relative abundance",
title = "") +
guides(fill = guide_legend(ncol = 1),
col = guide_legend(ncol = 1))Probably come back to this plot, for the main text and evaluate which one to plot as an example… likely the core MAGs chosen to look at. and perhaps those that show some trophic differences of interest.
From ASV comparison (n=42). r^2 value must be > 0.6, and ASV-MAG occurrence needs to be > 25.
## mag
## 1 MS-all-DCM-0-8-5-00_bin-318
## 2 MS-all-DCM-0-8-5-00_bin-318
## 3 MS-all-DCM-0-8-5-00_bin-318
## 4 MS-all-DCM-0-8-5-00_bin-318
## 5 MS-all-DCM-0-8-5-00_bin-318
## 6 MS-all-DCM-0-8-5-00_bin-318
## seq
## 1 GTCGCACCTACCGATCGAAGGTAATGATGAGCCCTCAGGATTGGAACTTGGATCGGGTTTCCCGAACCTTGTGCCGAGAATCTGTGCAAATCCCTACCTTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## 2 GTCGCACCTACCGATCGAAGGTAATGATGAGCCCTCAGGATTGGAACTTGGATCGGGTTTCCCGAACCTTGTGCCGAGAATCTGTGCAAATCCCTACCTTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## 3 GTCGCACCTACCGATCGAAGGTAATGATGAGCCCTCAGGATTGGAACTTGGATCGGGTTTCCCGAACCTTGTGCCGAGAATCTGTGCAAATCCCTACCTTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## 4 GTCGCACCTACCGATCGAAGGTAATGATGAGCCCTCAGGATTGGAACTTGGATCGGGTTTCCCGAACCTTGTGCCGAGAATCTGTGCAAATCCCTACCTTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## 5 GTCGCACCTACCGATCGAAGGTAATGATGAGCCCTCAGGATTGGAACTTGGATCGGGTTTCCCGAACCTTGTGCCGAGAATCTGTGCAAATCCCTACCTTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## 6 GTCGCACCTACCGATCGAAGGTAATGATGAGCCCTCAGGATTGGAACTTGGATCGGGTTTCCCGAACCTTGTGCCGAGAATCTGTGCAAATCCCTACCTTTAGAGGAAGGTGAAGTCGTAACAAGGTTACC
## r.squared Intercept Slope domain fullname.MAG
## 1 0.6834365 0.007994976 11.61199 Eukaryota Stramenopiles;Ochrophyta
## 2 0.6834365 0.007994976 11.61199 Eukaryota Stramenopiles;Ochrophyta
## 3 0.6834365 0.007994976 11.61199 Eukaryota Stramenopiles;Ochrophyta
## 4 0.6834365 0.007994976 11.61199 Eukaryota Stramenopiles;Ochrophyta
## 5 0.6834365 0.007994976 11.61199 Eukaryota Stramenopiles;Ochrophyta
## 6 0.6834365 0.007994976 11.61199 Eukaryota Stramenopiles;Ochrophyta
## TAXA supergroup.MAG division.MAG class.MAG order.MAG
## 1 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## 2 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## 3 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## 4 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## 5 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## 6 Stramenopiles;Ochrophyta Stramenopiles Ochrophyta - -
## family.MAG genus.MAG species.MAG run_accession.MAG tpm relabun.MAG
## 1 - - - ERR1700903 0.000000 0.000000000
## 2 - - - ERR1700912 2.364306 0.001766583
## 3 - - - ERR1726556 12.718982 0.005777532
## 4 - - - ERR1726556 12.718982 0.005777532
## 5 - - - ERR1726556 12.718982 0.005777532
## 6 - - - ERR1726556 12.718982 0.005777532
## Sub_region REGION subregion Depth_sizefrac DEPTH SIZEFRAC sampleid
## 1 SPO-SPSG SPO SPSG DCM-5-20.00 DCM 5-20.00 TARA_100_DCM_5-20
## 2 IO-all IO all DCM-5-20.00 DCM 5-20.00 TARA_041_DCM_5-20
## 3 SPO-all SPO all SRF-0.8-5.00 SRF 0.8-5.00 TARA_122_SRF_0.8->
## 4 SPO-all SPO all SRF-0.8-5.00 SRF 0.8-5.00 TARA_122_SRF_0.8->
## 5 SPO-SPSG SPO SPSG SRF-0.8-5.00 SRF 0.8-5.00 TARA_122_SRF_0.8->
## 6 SPO-SPSG SPO SPSG SRF-0.8-5.00 SRF 0.8-5.00 TARA_122_SRF_0.8->
## run_accession.ASV
## 1 ERR562723
## 2 ERR562623
## 3 ERR562568
## 4 ERR562568
## 5 ERR562568
## 6 ERR562568
## fullname.ASV
## 1 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Rhizochromulinales;Rhizochromulina;Rhizochromulina marina
## 2 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Rhizochromulinales;Rhizochromulina;Rhizochromulina marina
## 3 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Rhizochromulinales;Rhizochromulina;Rhizochromulina marina
## 4 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Rhizochromulinales;Rhizochromulina;Rhizochromulina marina
## 5 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Rhizochromulinales;Rhizochromulina;Rhizochromulina marina
## 6 Stramenopiles;Ochrophyta;Dictyochophyceae;Dictyochophyceae_X;Rhizochromulinales;Rhizochromulina;Rhizochromulina marina
## supergroup.ASV division.ASV class.ASV order.ASV
## 1 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## 2 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## 3 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## 4 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## 5 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## 6 Stramenopiles Ochrophyta Dictyochophyceae Dictyochophyceae_X
## family.ASV genus.ASV species.ASV relabun.ASV OCCUR
## 1 Rhizochromulinales Rhizochromulina Rhizochromulina marina 0.0009968400 33
## 2 Rhizochromulinales Rhizochromulina Rhizochromulina marina 0.0004877335 33
## 3 Rhizochromulinales Rhizochromulina Rhizochromulina marina 0.0009265393 33
## 4 Rhizochromulinales Rhizochromulina Rhizochromulina marina 0.0009265393 33
## 5 Rhizochromulinales Rhizochromulina Rhizochromulina marina 0.0009265393 33
## 6 Rhizochromulinales Rhizochromulina Rhizochromulina marina 0.0009265393 33
## category dictyo seq_num seq_id MAG_tax ASV_tax
## 1 false-true ASV 5815 ASV-5815 Ochrophyta Rhizochromulina marina
## 2 false-true ASV 5815 ASV-5815 Ochrophyta Rhizochromulina marina
## 3 false-true ASV 5815 ASV-5815 Ochrophyta Rhizochromulina marina
## 4 false-true ASV 5815 ASV-5815 Ochrophyta Rhizochromulina marina
## 5 false-true ASV 5815 ASV-5815 Ochrophyta Rhizochromulina marina
## 6 false-true ASV 5815 ASV-5815 Ochrophyta Rhizochromulina marina
## MAG_ID ASV_ID
## 1 MAG-318-Ochrophyta ASV-5815-Rhizochromulina marina
## 2 MAG-318-Ochrophyta ASV-5815-Rhizochromulina marina
## 3 MAG-318-Ochrophyta ASV-5815-Rhizochromulina marina
## 4 MAG-318-Ochrophyta ASV-5815-Rhizochromulina marina
## 5 MAG-318-Ochrophyta ASV-5815-Rhizochromulina marina
## 6 MAG-318-Ochrophyta ASV-5815-Rhizochromulina marina
mag_ASVbased <- select(dictyo_lm_toplot, mag) %>% distinct() %>%
add_column(ASV_based = TRUE)
dim(mag_ASVbased)## [1] 42 2
From EUKulele lowered threshold (n=33).
euk_thres <- read.delim("input-data/list_dictyochophyceae.txt", header = FALSE)
# head(euk_thres)
dim(euk_thres)## [1] 33 1
Incorporate BUSCO completeness and contamination to pare down the list
Combined MAG lists (n=43)
dictyo_comparison <- euk_thres %>%
add_column(EUK_based = TRUE) %>%
mutate(mag = gsub("-max-level.csv", "", V1)) %>%
select(-V1, mag, EUK_based) %>%
full_join(mag_ASVbased) %>%
left_join(buscos, by = c("mag" = "X"))## Joining, by = "mag"
## EUK_based mag ASV_based Completeness Contamination
## 1 TRUE IO-all-DCM-0-8-5-00_bin-130 NA 12.156863 0.0000000
## 2 TRUE IO-all-SRF-0-8-5-00_bin-113 NA 21.960784 0.3921569
## 3 TRUE IO-all-SRF-0-8-5-00_bin-351 NA 16.862745 0.3921569
## 4 TRUE IO-all-SRF-0-8-5-00_bin-80 NA 72.549020 2.3529412
## 5 TRUE MS-all-DCM-0-8-5-00_bin-257 TRUE 12.941176 0.0000000
## 6 TRUE MS-all-DCM-0-8-5-00_bin-290 TRUE 6.666667 0.0000000
## [1] 57 5
# tmp <- filter(dictyo_comparison, mag %in% dic) %>% select(mag) %>% distinct()
# View(tmp)
# View(dictyo_comparison)
# write.table(dictyo_comparison %>% select(mag), file = "all-putative-dictyophyte-mags.txt", quote = FALSE, row.names = FALSE, col.names = FALSE)
# ?write.table()Get stats on comparisons. Plan to subset MAGs based on the fact that they were assigned Dictyochophyte class, but lower than the default EUKulele settings, and they had >40% compelteness. We will come back to evaluate the ASV assignment here.
# Number that overlap
putative_dictyo <- dictyo_comparison %>%
# Subset so that the EUKulele assignment (>0%) is prioritized over ASV-based evidence
filter(!is.na(EUK_based)) %>%
# filter(!is.na(ASV_based)) %>%
# filter(!(Completeness >= 40)) %>%
# Subset so BUSOC is >40% compelte
filter(Completeness >= 40) %>%
data.frame
dim(putative_dictyo) # n = 11## [1] 11 5
# View(putative_dictyo)
# write.table(tmp %>% select(mag), file = "actual-putative-dictyophyte-mags.txt",
# quote = FALSE, row.names = FALSE, col.names = FALSE)Of the 11 chosen as putative dictyochophyte MAGs, 6 were also found to strongly correspond with ASV relative abundances. To see if we can obtain more detailed taxonomic assignment from those MAGs, plot the ASV-MAG comparison.
# Subset for ASV main text plot
asv_of_interest <- as.character((putative_dictyo %>%
filter(EUK_based == TRUE) %>%
select(mag))$mag)
asv_of_interest## [1] "IO-all-SRF-0-8-5-00_bin-80" "MS-all-SRF-0-8-5-00_bin-158"
## [3] "MS-all-SRF-0-8-5-00_bin-250" "MS-all-SRF-0-8-5-00_bin-582"
## [5] "MS-all-SRF-0-8-5-00_bin-588" "NAO-all-MIX-0-8-5-00_bin-170"
## [7] "SAO-all-SRF-0-8-5-00_bin-111" "SAO-all-SRF-0-8-5-00_bin-590"
## [9] "SPO-all-DCM-0-8-5-00_bin-501" "SPO-all-SRF-0-8-5-00_bin-318"
## [11] "SPO-SPSG-SRF-0-8-5-00_bin-290"
# head(dictyo_lm_toplot)
# Subset the dictyo plot data frame
dictyo_asvs <- filter(dictyo_lm_toplot, mag %in% asv_of_interest)
# length(unique(dictyo_asvs$mag))
# length(unique(dictyo_asvs$ASV_ID))
# View(select(dictyo_asvs, mag, ASV_ID) %>% distinct())Plot 6 putative dictyochophyte MAGs with ASV comparisons
dictyo_bestfit <- dictyo_asvs %>%
ggplot(aes(x = relabun.ASV, y = relabun.MAG, shape = MAG_ID, color = ASV_ID, fill = ASV_ID)) +
geom_abline(intercept = 0, slope = 1, alpha = .5) + # Line of perfect fit
geom_abline(aes(group = seq, slope = Slope,
intercept = Intercept, color = ASV_ID)) +
# geom_dl(aes(label = round(r.squared, 3), color = ASV_tax),
# method = list("last.points", cex = 0.8)) +
# geom_dl(aes(label = round(r.squared, 2), color = ASV_tax),
# method = list("smart.grid", cex = 0.8)) +
geom_point(aes(shape = MAG_ID, color = ASV_ID, fill = ASV_ID), size = 2) +
# scale_shape_manual(values = c(21, 22, 23, 24)) +
scale_fill_brewer(palette = "Dark2") +
scale_color_brewer(palette = "Dark2") +
# scale_linetype_manual(values = c(4,2,1,4)) +
scale_x_continuous(limits = c(0,3.5), expand = c(0,0)) +
scale_y_continuous(limits = c(0,3.5), expand = c(0,0)) +
# scale_y_log10() + scale_x_log10() +
theme_linedraw() +
theme(axis.text = element_text(color = "black", face = "bold"),
strip.background = element_blank(),
strip.text = element_text(size = 8, color = "black", hjust = 1, vjust = -1),
legend.position = "right",
legend.title = element_blank(),
plot.margin = unit(c(1,3,1,1), "lines")) +
labs(x = "MAG relative abundance", y = "ASV relative abundance",
title = "") +
guides(fill = guide_legend(ncol = 1),
col = guide_legend(ncol = 1))
# svg("bestfit-dictyochophyte-panel.svg", h=5,w=7)
dictyo_bestfitSubset the list of all metatranscriptome samples within the small size fraction. Map these transcripts to the putative dictyochophyte MAGs.
# head(metaT_metadata)
# metaT_small_sizefrac <- metaT_metadata %>%
# select(Depth_sizefrac, ERR_list) %>%
# separate_rows(ERR_list, sep = ", ") %>%
# separate(Depth_sizefrac, c("DEPTH", "min", "max"), sep = "-", remove = FALSE) %>%
# unite(SIZEFRAC, min, max, sep = "-") %>%
# filter(SIZEFRAC == "0.8-5.00") %>%
# data.frame
# head(metaT_small_sizefrac)
# unique(metaT_small_sizefrac$SIZEFRAC)
# write.table(metaT_small_sizefrac$ERR_list, file = "ERR-metaT-smallsizefrac.txt",
# col.names = FALSE, row.names = FALSE, quote = FALSE)# Select mags of interest
mags_of_interest <- as.character((putative_dictyo %>%
select(mag))$mag)
length(mags_of_interest) # n = 11## [1] 11
Isolate TPM MAG abundances.
# Import MAG counts (TPM)
mag_counts_updated <- read.csv("input-data/MAG_tpm.csv")
# head(mag_counts_updated)
# head(metaG_reformat)
dictyo_mag_tpm <- mag_counts_updated %>%
filter(Genome %in% mags_of_interest) %>%
pivot_longer(cols = starts_with("ERR"), names_to = "run_accession", values_to = "tpm") %>%
left_join(metaG_reformat) %>%
data.frame## Joining, by = "run_accession"
## Genome run_accession tpm Sub_region REGION
## 1 SAO-all-SRF-0-8-5-00_bin-590 ERR1700889 0.2356308 IO-all IO
## 2 SAO-all-SRF-0-8-5-00_bin-590 ERR1700890 0.1858975 IO-all IO
## 3 SAO-all-SRF-0-8-5-00_bin-590 ERR1700891 2.7638506 NAO-all NAO
## 4 SAO-all-SRF-0-8-5-00_bin-590 ERR1700892 1.5574544 IO-all IO
## 5 SAO-all-SRF-0-8-5-00_bin-590 ERR1700893 25.7782555 NAO-all NAO
## 6 SAO-all-SRF-0-8-5-00_bin-590 ERR1700894 10.9937890 MS-all MS
## subregion Depth_sizefrac DEPTH SIZEFRAC sampleid
## 1 all SRF-180-2000.00 SRF 180-2000.00 TARA_039_SRF_180-2000
## 2 all SRF-5-20.00 SRF 5-20.00 TARA_041_SRF_5-20
## 3 all DCM-5-20.00 DCM 5-20.00 TARA_142_DCM_5-20
## 4 all ZZZ-0.8-5.00 ZZZ 0.8-5.00 TARA_040_ZZZ_3->
## 5 all SRF-5-20.00 SRF 5-20.00 TARA_143_SRF_5-20
## 6 all DCM-5-20.00 DCM 5-20.00 TARA_009_DCM_5-20
## [1] 10934 10
## [1] "Genome" "run_accession" "tpm" "Sub_region"
## [5] "REGION" "subregion" "Depth_sizefrac" "DEPTH"
## [9] "SIZEFRAC" "sampleid"
Sum TPM of MAGs based on Size Fraction, Location, and Depth
dictyo_MAG_tpm_location <- dictyo_mag_tpm %>%
filter(!is.na(REGION)) %>%
group_by(mag = Genome, DEPTH, REGION, SIZEFRAC) %>%
summarise(mag_tpm = sum(tpm)) %>%
mutate(log_mag_tpm = log(mag_tpm)) %>%
unite(SAMPLE, REGION, DEPTH, SIZEFRAC, sep = " ", remove = FALSE) %>%
filter(!is.na(mag_tpm))## `summarise()` regrouping output by 'mag', 'DEPTH', 'REGION' (override with `.groups` argument)
## [1] 825 7
Visualize MAG dictyochophyceae with pheatmap - MAG relative abundance.
pheat_dict_mags <- dictyo_MAG_tpm_location %>%
select(mag, SAMPLE, log_mag_tpm) %>%
filter(!is.na(log_mag_tpm)) %>%
mutate_if(is.numeric, function(x) ifelse(is.infinite(x), 0, x)) %>%
pivot_wider(id_cols = SAMPLE, names_from = mag, values_from = log_mag_tpm, values_fill = 0) %>%
column_to_rownames(var = "SAMPLE")## Adding missing grouping variables: `DEPTH`, `REGION`
## `mutate_if()` ignored the following grouping variables:
## Columns `mag`, `DEPTH`, `REGION`
# Obtain annotation
annotate_dictyo <- dictyo_MAG_tpm_location %>%
ungroup() %>%
filter(!is.na(REGION)) %>%
select(SAMPLE, REGION, DEPTH, SIZEFRAC) %>%
distinct() %>%
column_to_rownames(var = "SAMPLE")
# View(annotate_dictyo)
rownames(annotate_dictyo) <- rownames(pheat_dict_mags)
# rownames(pheat_dict_mags)
# unique(annotate_dictyo$REGION)
# unique(annotate_dictyo$DEPTH)
# unique(annotate_dictyo$SIZEFRAC)
# Specify color schema
annotation_colors_dictyo = list(
REGION = c(IO= "#711518", MS= "#ce536b", NAO= "#c76b4a", NPO= "#dfa837", SAO= "#93b778", SO= "#61ac86", SPO= "#657abb", RS= "#67765b"),
DEPTH = c(DCM= "#74a9cf", FSW= "#969696", SRF= "#d0d1e6", MIX= "#0570b0", MES= "#023858", ZZZ= "#252525"),
SIZEFRAC = c(`0.8-5.00` = "#f7fcb9",`180-2000.00` = "#004529", `20-180.00` = "#41ab5d",`5-20.00` = "#addd8e"))MAG relative abundace by sample and TPM.
# svg("figs/pheatmap-MAG-relabun-bysample.svg", h = 17, w = 17)
pheatmap(pheat_dict_mags,
annotation_row = annotate_dictyo,
annotation_colors = annotation_colors_dictyo,
scale = "none",
cluster_cols = TRUE,
cluster_rows = TRUE,
cellwidth = 10, cellheight = 10,
main = "MAG Relative Abundance (TPM)")region_dictyo <- dictyo_MAG_tpm_location %>%
replace_na(list(mag_tpm=0)) %>%
group_by(REGION, mag) %>%
summarise(mag_sum_tpm = sum(mag_tpm))## `summarise()` regrouping output by 'REGION' (override with `.groups` argument)
region_order <- c("IO","MS","NAO","NPO","SAO","SO","SPO","RS")
region_color <- c("#711518", "#ce536b", "#c76b4a", "#dfa837", "#93b778", "#61ac86", "#657abb", "#67765b")
region_dictyo$REGION_ORDER <- factor(region_dictyo$REGION, levels = region_order)
names(region_color) <- region_order
# View(region_dictyo)ggplot(region_dictyo, aes(x = mag, fill = REGION_ORDER, y = mag_sum_tpm)) +
geom_bar(color = "white", stat = "identity") +
scale_fill_manual(values = region_color) +
theme_linedraw() +
coord_flip() +
scale_y_continuous(expand = c(0,0)) +
theme(legend.title = element_blank()) +
labs(y = "MAG Abundance TPM", x = "Putative Dictyochophyte MAGs")— EXCESS –